# initiation --------------------------------------------------------------
library(sandwich)
library(lmtest)
library(tidyverse)
library(readr)
library(ggpubr)
library(marginaleffects)
library(vtable)
library(stargazer)
library(interflex)
library(ggridges)

setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

data <- read_csv("data_replication.csv")
data <- as.data.frame(data)
options(scipen=999)
controls <- c("agegroup", 
              "male", 
              "higher_ed", 
              "financial_situation_scale", 
              "large_city", 
              "vpn_user", 
              "ethnicity", 
              "children",
              "language_russian", 
              "government_employee",
              "region", "confession", "kazakh_media", "media_abroad")


# cleaning ----------------------------------------------------------------
data <- data %>% 
  mutate(
    ethnicity = relevel(as.factor(ethnicity), ref = "Kazakh"),
    region = as.factor(region),
    confession = as.factor(confession),
    children = as.factor(children),
    agegroup = as.factor(agegroup)
  )
data <- data %>% 
  mutate(
    treatment = relevel(as.factor(treatment), ref = "control"))


data_long <- pivot_longer(data, cols =c("direct_list_1_refuse","direct_list_2_refuse","direct_list_3_refuse","direct_list_4_refuse"),
                          names_to = "question",
                          values_to = "refuse",
                          values_drop_na = TRUE)
data_long <- data_long %>% 
  mutate(
    question = relevel(as.factor(question), ref = "direct_list_4_refuse"))

data_long <- as.data.frame(data_long)
data_long_3 <- subset(data_long, question!="direct_list_4_refuse")

# Table 1 -----------------------------------------------------------------

long3_lm_cov <- lm(refuse ~ question + treatment+
                     media_abroad+ 
                     higher_ed + 
                     agegroup + 
                     male + 
                     financial_situation_scale + 
                     large_city + 
                     vpn_user + 
                     ethnicity + confession + children + 
                     language_russian + 
                     government_employee +
                     region, data= data_long_3, weights=weights)



long3_lm <- lm(refuse ~ question + treatment, data= data_long_3, weights=weights)

long3_lm_cov_int <- lm(refuse ~ question + treatment*
                         media_abroad+ 
                         higher_ed + 
                         agegroup + 
                         male + 
                         financial_situation_scale + 
                         large_city + 
                         vpn_user + 
                         ethnicity + confession + children + 
                         language_russian + 
                         government_employee +
                         region, data= data_long_3, weights=weights)
long3_lm_cov_int_ed <- lm(refuse ~ question + treatment *
                            higher_ed + 
                            media_abroad+ 
                            agegroup + 
                            male + 
                            financial_situation_scale + 
                            large_city + 
                            vpn_user + 
                            ethnicity + confession + children + 
                            language_russian + 
                            government_employee +
                            region, data= data_long_3, weights=weights)
long_lm_cov <- lm(refuse ~ question + treatment+
                    media_abroad+ 
                    higher_ed + 
                    agegroup + 
                    male + 
                    financial_situation_scale + 
                    large_city + 
                    vpn_user + 
                    ethnicity + confession + children + 
                    language_russian + 
                    government_employee +
                    region, data= data_long, weights=weights)



# Calculate robust standard errors using the HC1 estimator
long3_lm <- coeftest(long3_lm, vcov = vcovHC(long3_lm, type = "HC1"))
long3_lm_cov <- coeftest(long3_lm_cov, vcov = vcovHC(long3_lm_cov, type = "HC1"))
long3_lm_cov_int <- coeftest(long3_lm_cov_int, vcov = vcovHC(long3_lm_cov_int, type = "HC1"))
long3_lm_cov_int_ed <- coeftest(long3_lm_cov_int_ed, vcov = vcovHC(long3_lm_cov_int_ed, type = "HC1"))


stargazer(long3_lm,
          long3_lm_cov,
          long3_lm_cov_int,
          long3_lm_cov_int_ed,
          omit.stat = c("ser", "f"),
          single.row = F,
          no.space=TRUE,
          type="text",
          dep.var.labels.include = FALSE, 
          dep.var.caption = "``Prefer not to answer''",
          keep = c("treatmentprivacy","treatmentsurveillance","media_abroad","higher_ed","treatmentprivacy:media_abroad","treatmentsurveillance:media_abroad","higher_ed:media_abroad","higher_ed:media_abroad"),
          add.lines =list(c("Region FE","No","Yes","Yes","Yes"),
                          c("Controls", "No","Yes","Yes","Yes")),
          covariate.labels=c("Privacy","Surveillance","International Media","Higher Education", "Privacy X International Media", "Surveillance X International Media", "Privacy X Higher Education", "Surveillance X Higher Education"),
          column.sep.width = "1pt", notes.label = "\\parbox{\\linewidth}{\\textit{Notes:} The dependent variable is answering \\textit{prefer not to answer} to a sensitive question. Robust standard errors in parentheses. Weighting was applied. 
                                                    The list of control variables includes: age group, gender, financial situation,
                                                    city size, VPN usage, ethnicity, number of children, Russian language proficiency, 
                                                    government employment status, region, religious affiliation, and consumption of Kazakh media.}")


# Table A1 ----------------------------------------------------------------


st(data, 
   vars= c("agegroup", 
           "male", 
           "higher_ed", 
           "financial_situation_scale", 
           "large_city", 
           "vpn_user", 
           "ethnicity", 
           "children",
           "language_russian", 
           "government_employee", "media_abroad"),
   summ = c('notNA(x)', 'mean(x)', 'sd(x)'), 
   summ.names=c('N','Mean', 'SD'),
   title='Summary statistics',
   out ='latex', 
   file='summary_statistics',
   factor.numeric=T,
   digits = 3)



# Table A2 ----------------------------------------------------------------



prot <- lm(direct_list_1_refuse ~ treatment, data= data, weights=weights)
sanc <- lm(direct_list_2_refuse ~ treatment, data= data, weights=weights)
inva <- lm(direct_list_3_refuse ~ treatment, data= data, weights=weights)
work <- lm(direct_list_4_refuse ~ treatment, data= data, weights=weights)
stargazer(prot,
          sanc,
          inva,
          work,
          type="text", 
          single.row = F,
          column.labels =c('Protest','Sanction evasion','Invasion', 'Placebo'),
          no.space=TRUE,
          dep.var.labels.include = FALSE,
          keep=c("treatmentprivacy","treatmentsurveillance","Constant"), 
          label = "reg_refuse",
          covariate.labels=c("Privacy","Surveillance","Constant"),
          column.sep.width = "1pt")

# Figure 1 ----------------------------------------------------------------



direct_1_plot <- ggplot(data, aes( x= as.factor(direct_list_1),  group=factor(treatment, levels=c("control", "surveillance", "privacy")))) + 
  geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count", color="black") +
  geom_text(aes( label = scales::percent(..prop.., accuracy = .1),
                 y= ..prop.. ), stat= "count", vjust = -.5, size=4) +
  scale_fill_viridis_d(option="mako", begin = 0, end = 0.7,labels=c("Justified", "Not justified", "Prefer not to answer"))+
  labs(y = "", fill="In your opinion, is [item] generally justified or not justified? ",x="Political protest") +
  facet_grid(~factor(treatment, levels=c("control", "surveillance", "privacy"))) +
  scale_y_continuous(limits = c(0, .55), breaks = seq(0, .5, by = .1),labels = scales::percent) +
  theme_bw()+
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) 
direct_2_plot <- ggplot(data, aes( x= as.factor(direct_list_2),  group=factor(treatment, levels=c("control", "surveillance", "privacy")))) + 
  geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count", color="black") +
  geom_text(aes( label = scales::percent(..prop.., accuracy = .1),
                 y= ..prop.. ), stat= "count", vjust = -.5, size=4) +
  scale_fill_viridis_d(option="mako", begin = 0, end = 0.7,labels=c("Justified", "Not justified", "Prefer not to answer"))+
  labs(y = "", fill="In your opinion, is [item] generally justified or not justified? ",x="Avoiding sanctions") +
  facet_grid(~factor(treatment, levels=c("control", "surveillance", "privacy"))) +
  scale_y_continuous(limits = c(0, .55), breaks = seq(0, .5, by = .1),labels = scales::percent) +
  theme_bw()+
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) 
direct_3_plot <- ggplot(data, aes( x= as.factor(direct_list_3),  group=factor(treatment, levels=c("control", "surveillance", "privacy")))) + 
  geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count", color="black") +
  geom_text(aes( label = scales::percent(..prop.., accuracy = .1),
                 y= ..prop.. ), stat= "count", vjust = -.5, size=4) +
  scale_fill_viridis_d(option="mako", begin = 0, end = 0.7,labels=c("Justified", "Not justified", "Prefer not to answer"))+
  labs(y = "", fill="In your opinion, is [item] generally justified or not justified? ",x="Invading Ukraine/SMO") +
  facet_grid(~factor(treatment, levels=c("control", "surveillance", "privacy"))) +
  scale_y_continuous(limits = c(0, .55), breaks = seq(0, .5, by = .1),labels = scales::percent) +
  theme_bw()+
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) 
direct_4_plot <- ggplot(data, aes( x= as.factor(direct_list_4),  group=factor(treatment, levels=c("control", "surveillance", "privacy")))) + 
  geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count", color="black") +
  geom_text(aes( label = scales::percent(..prop.., accuracy = .1),
                 y= ..prop.. ), stat= "count", vjust = -.5, size=4) +
  scale_fill_viridis_d(option="mako", begin = 0, end = 0.7,labels=c("Justified", "Not justified", "Prefer not to answer"))+
  labs(y = "", fill=" ", x="Placebo: Working more than 50 hours per week") +
  facet_grid(~factor(treatment, levels=c("control", "surveillance", "privacy"))) +
  scale_y_continuous(limits = c(0, .55), breaks = seq(0, .5, by = .1),labels = scales::percent) +
  theme_bw()+
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())


result_plots <- ggarrange(direct_1_plot,direct_2_plot,direct_3_plot,direct_4_plot, ncol = 2, nrow = 2, common.legend = TRUE)

result_plots
ggsave(file="means_dv_treatment.jpg", width = 10, height = 5,units = 'in')




# Figure 2 ----------------------------------------------------------------
long_if_refuse_bin <- interflex(estimator = 'binning', Y = "refuse", D = "treatment", X = "media_abroad", 
                                data = subset(data_long_3, treatment!="privacy"), 
                                Xunif = F, 
                                cutoffs = c(0.5),
                                Z= controls, 
                                na.rm = TRUE, 
                                theme.bw = TRUE, 
                                show.grid = T,
                                Xlabel="International media",
                                vcov.type = "robust", 
                                bin.labs = F,
                                ylab = "Marginal Effect",
                                weights = "weights")
long_if_refuse_bin$figure
ggsave(file="media_int.jpg", width = 6, height = 4,units = 'in')





# Figure 3 ----------------------------------------------------------------
long_if_refuse_bin_ed <- interflex(estimator = 'binning', Y = "refuse", D = "treatment", X = "higher_ed", 
                                   data = subset(data_long_3, treatment!="privacy"), 
                                   Xunif = F, 
                                   cutoffs = c(0.5),
                                   Z= controls, 
                                   na.rm = TRUE, 
                                   theme.bw = TRUE, 
                                   show.grid = T,
                                   Xlabel="Higher education",
                                   vcov.type = "robust", 
                                   bin.labs = F,
                                   ylab = "Marginal Effect",
                                   weights = "weights")
long_if_refuse_bin_ed$figure
ggsave(file="education_int.jpg", width = 6, height = 4,units = 'in')



# Figure A2 ---------------------------------------------------------------
# Pivot the media origin columns into long format
data_long_media <- data %>%
  pivot_longer(
    # Select all columns that have the pattern "mediaType_origin_region"
    cols = matches("^(facebook|instagram|mail_ru|moi_mir|odnoklassniki|other|telegram|tiktok|tv|twitter|vkontakte|websites|yandex|youtube)_origin_.*"),
    names_to = c("media_type", "origin"),
    names_pattern = "(.*)_origin_(.*)",
    values_to = "consumption"
  ) %>%
  filter(consumption == 1) 


data_long_media %>% 
  mutate(origin = recode(origin, "dk" = "unknown"),
         media_type = fct_infreq(media_type)) %>% 
  ggplot(aes(media_type, fill = origin)) +
  geom_bar(position = "dodge") +
  theme_bw() +  
  labs(x = "Media source", y = "Count", fill="Origin") +
  scale_fill_viridis_d(begin = 0, end = 1) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))


# Figure A3 ---------------------------------------------------------------
# Long-format the media origin variables
media_long <- data %>%
  select(kazakh_media, russian_media_high, west_media, turkey_media, other_media) %>%
  pivot_longer(
    cols = everything(),
    names_to = "media_origin",
    values_to = "consumption"
  ) %>%
  mutate(
    media_origin = recode(
      media_origin,
      kazakh_media = "Kazakh",
      russian_media_high = "Russian",
      west_media = "Western",
      turkey_media = "Turkish",
      other_media = "Other"
    )
  )


# Ridgeline "histogram" showing distribution of 0/1 responses
ggplot(media_long, aes(x = consumption, y = media_origin, fill = media_origin)) +
  geom_density_ridges(
    alpha = 0.85,
    bandwidth = 0.15,
    scale = 0.9
  ) +
  theme_bw() +
  scale_fill_viridis_d(begin = 0, end = 0.8) +
  scale_x_continuous(
    breaks = c(0, 1),        
    limits = c(-0.45, 1.45), 
    labels = c("No", "Yes")
  ) +
  labs(
    title = "Distribution of Media Origin Consumption",
    subtitle = "Ridgeline representation of binary (0/1) media origin indicators",
    x = "Consumes this media type?",
    y = "Media origin"
  ) +
  theme(
    legend.position = "none",
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )



# Figure A4 ---------------------------------------------------------------


data %>% 
  ggplot(aes(education, fill=factor(higher_ed)))+
  geom_bar(position = "dodge") +
  theme_bw() +  
  labs(x = "Level of education", y = "Count",fill="Higher\nEducation\nDummy") +
  scale_fill_viridis_d() +
  coord_flip()+
  scale_x_discrete(labels = function(x) str_wrap(x, width = 50)) 

